############################################################ ########### #
#
# Project: Stability and change in the opinion-policy relationship
# 
# This cleans census data
# 
# 2022.09.30. 
############################################################ ########### #

library(rio)
library(tidyverse)

raw <- import("A-original-data/census_13-15-18-19.csv")

# copy statecode - statename combinations from codebook

states_abb <- data.frame(
        state_codes = c(01, 02, 04, 05, 06, 08, 09, 10, 11, 12, 13, 15, 16, 17, 18, 
                        19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 
                        34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49,
                        50, 51, 53, 54, 55, 56), 
        state = c("Alabama", "Alaska", "Arizona", "Arkansas", "California", 
                  "Colorado", "Connecticut", "Delaware", "District of Columbia", 
                  "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", 
                  "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", 
                  "Massachusetts", "Michigan", "Minnesota", "Mississippi",
                  "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", 
                  "New Jersey", "New Mexico", "New York", "North Carolina", 
                  "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", 
                  "Rhode Island", "South Carolina", "South Dakota", "Tennessee", 
                  "Texas", "Utah", "Vermont", "Virginia", "Washington", 
                  "West Virginia", "Wisconsin", "Wyoming"))

# clean up data and generate counts by strata
df <- raw %>% 
        # filter out minors
        filter(AGE >= 18) %>% 
        # create a clean dataframe
        transmute(year = YEAR,
                  # grouping age into 4 groups. 18-29-44-64- 
                  age = as.numeric(cut(AGE, 
                                       breaks = c(18, 29, 44, 64, 
                                                  max(AGE, na.rm = TRUE)), 
                                       labels = 1:4, 
                                       include.lowest = TRUE)),
                  female = SEX - 1, 
                  # grouping education into 5 groups
                  edu = case_when(EDUCD < 61 ~ "nohs", 
                                  EDUCD >= 62 & EDUCD <= 64 ~ "hs", 
                                  EDUCD >= 65 & EDUCD <= 100 ~ "somecol", 
                                  EDUCD <= 101 & EDUCD <= 113 ~ "col", 
                                  EDUCD >= 114 ~ "postgrad", 
                                  TRUE ~ NA_character_),
                  # (not not) hispanic and not black = hisp 3
                  # black (even if also hispanic) = black 2
                  # white and other = 1
                  race = case_when(HISPAN != 0 & RACBLK == 1 ~ 2 , 
                                   RACBLK == 2 ~ 1, 
                                   TRUE ~ 0),
                  state_codes = STATEFIP,
                  count = PERWT
        ) %>% 
        group_by(year, age, female, edu, race, state_codes) %>% 
        summarise(freq=sum(count)) %>% 
        ungroup() %>% 
        complete(year, age, female, edu, race, state_codes, 
                 fill = list(freq = 0)) %>%
        left_join(states_abb) %>% 
        glimpse

# expected number of strata
# age = 4, female = 2, edu = 5, race = 3, state_codes = 51
# 4 * 2 * 5 * 3 * 51 # = 6120 
stopifnot(table(df$year) == 6120)

saveRDS(df, "B-analysis-data/census_13-19_raw.rds")

